Problem Set 4

Lionel Chambon

Loading packages:

library(ggplot2)
library(RColorBrewer)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(reshape2)
library(kableExtra)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()     masks plotly::filter(), stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt

Question 1

x_1 <- runif(1000, min = 0, max = 1)

x_1_plot <- as.data.frame(x_1) %>% 
  ggplot(aes(x = x_1)) +
  geom_histogram(binwidth=0.01, fill = '#34495E') +
  theme(panel.background = 
            element_rect(fill ="white"),
        axis.title.y = element_text(color = '#17202A'),
        axis.title.x = element_text(color = '#17202A')) +

  xlab("Distribution of x_1") +
  ylab("Count") 
  
ggplotly(x_1_plot) 
### Why do I have to use plotly here? Without it, R generates the plot but does
### not display it?
u_plus_1 <- rchisq(1000, df =1)

u_plus_1_plot <- as.data.frame(u_plus_1) %>%
  ggplot(aes( x = u_plus_1)) +
  geom_histogram(binwidth = 0.1, fill = '#34495E') +
  theme(panel.background = 
          element_rect(fill = "white"),
        axis.title.y = element_text(color = '#17202A'),
        axis.title.x = element_text(color = '#17202A')) +
  
  xlab("Distribution of u+1") +
  ylab("Count")

ggplotly(u_plus_1_plot)

###Question 2

  # a)
  f2_a <- function(N = 5) 
  {
    x <- rchisq(df = 1, N) 
    return(c(mean(x), sd(x))) # returns a vector containing the mean and the standard deviation of the generated values.
  }
  
  # b) 
  f2_b <- function( n=10000,N = 5 ){ 
    MEAN <- c() # To create a MEAN vector
    SD <- c() # To create a SD vector
    for (i in 1:n) 
    {
      df <- f2_a(N) 
      MEAN[i]  <- df[1] # It attributes the mean to the vector 'MEAN'
      SD[i] <- df[2] # It attributes the standard deviation to the vector 'SD'
    }
    return (data.frame(MEAN,SD))
    # We get the desired dataset in a dataframe format. 
  }
  simulated_chi <- f2_b()
  
  # c) 

simulation_means_plot <- as.data.frame(simulated_chi[, 1]) %>%
  ggplot(aes(x = simulated_chi[, 1])) +
  geom_histogram(binwidth=0.3, fill = '#34495E') +
  theme(panel.background = 
            element_rect(fill ="white"),
        axis.title.y = element_text(color = '#17202A'),
        axis.title.x = element_text(color = '#17202A')) +

  xlab("Distribution of Means") +
  ylab("Count") 
  
ggplotly(simulation_means_plot) 

Question 3

# N = 10 : 
  q3a <- f2_b(N=10)
  summary(q3a[,1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.07237 0.66471 0.93040 0.99057 1.24218 4.27836
# N = 100 : 
  q3b <- f2_b(N=100)
  summary(q3b[,1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5789  0.9000  0.9887  0.9976  1.0871  1.6027
# N = 1000 : 
  q3c <- f2_b(N=1000)
  summary(q3c[,1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.8243  0.9699  0.9998  1.0005  1.0307  1.1773
# We observe that all summary statistics tend to 1 the greater the N. 

###Question 4

sd_n_10 <- sd(simulated_chi[, 1])
print(sd_n_10)
## [1] 0.6325471
multi_4a <- sd_n_10*(sqrt(10)/sqrt(100))
print(multi_4a)
## [1] 0.2000289
multi_4b <- sd_n_10*(sqrt(10)/sqrt(1000))
print(multi_4b)
## [1] 0.06325471
multi_4c <- sd_n_10*(sqrt(10)/sqrt(3500000))
print(multi_4c)
## [1] 0.0010692

The second multiplication divides the standard deviation by 10. Through guessing I obtain a sample size of N approx. = 3 500 000.

Question 5

# N = 10 : 
  q4a <- f2_b(N=10)
  hist(q3a[,1])

# N = 100 : 
  q4b <- f2_b(N=100)
  hist(q3b[,1])

# N = 1000 : 
  q4c <- f2_b(N=1000)
  hist(q3c[,1])

# We observe the mean becomes approximately more and more
# normally distributed.
# Playing around with ggplot2 for my own amusement:

#a)

q4a_plot <- as.data.frame(q4a) %>%
  ggplot(aes(x = MEAN)) +
  geom_histogram(binwidth=0.05, fill = '#34495E') +
  theme(panel.background = 
            element_rect(fill ="white"),
        axis.title.y = element_text(color = '#17202A'),
        axis.title.x = element_text(color = '#17202A')) +

  xlab("Distribution of Means") +
  ylab("Count") 
  
ggplotly(q4a_plot) 
#b)

q4b_plot <- as.data.frame(q4b) %>%
  ggplot(aes(x = MEAN)) +
  geom_histogram(binwidth=0.05, fill = '#34495E') +
  theme(panel.background = 
            element_rect(fill ="white"),
        axis.title.y = element_text(color = '#17202A'),
        axis.title.x = element_text(color = '#17202A')) +

  xlab("Distribution of Means") +
  ylab("Count") 
  
ggplotly(q4b_plot) 
#c)

q4c_plot <- as.data.frame(q4c) %>%
  ggplot(aes(x = MEAN)) +
  geom_histogram(binwidth=0.01, fill = '#34495E') +
  theme(panel.background = 
            element_rect(fill ="white"),
        axis.title.y = element_text(color = '#17202A'),
        axis.title.x = element_text(color = '#17202A')) +

  xlab("Distribution of Means") +
  ylab("Count") 
  
ggplotly(q4c_plot) 

How could I create an alternative chart that overlapps the three distributions to show that they become more and more Gaussian? For example, how could I obtain a graph with three bell-shaped curves for each of the distributions?

Question 6

  # a)
  
  rm(list = ls())
  N = 10
  
  # b)
  
  x1 <- runif(N, min = 0, max = 1)
  x2 <- rbinom(N, 1, 0.3)
  u <- rchisq(N, df = 1)
  y <- 1 + 2 * x1 + 10 * x2 + u
  
  # c) 
  regression_q6 <- lm(y ~ x1 + x2) 
  
  # d)
  
  regression_q6$coefficients 
## (Intercept)          x1          x2 
##   2.4733578   0.2912243  12.0992154
  sd <- c(sd(x1),sd(x2),sd(u)) 
  
# Creating a function to generate data:
  
  function1_q6 <- function(N = 10){
    x1 <- runif(N, min = 0, max = 1)
    x2 <- rbinom(N, 1, 0.3)
    u <- rchisq(N, df = 1)
    y <- 1 + 2*x1 + 10*x2 + u
    regression_q6 <- lm(y ~ x1 + x2) 
    return(c(regression_q6$coefficients, "sd(x1)"=sd(x1), "sd(x2)"=sd(x2), "sd(u)"=sd(u)))
  }
  
# Second, a function to simulate this several times and collect the data.
  function2_q6 <- function(n=10000, N=10){
    df <- data.frame(matrix(0, ncol = 6, nrow = 0))
    names(df) <- c("Intercept",
                   "First coef",
                   "Second coef",
                   "Sd(x1)",
                   "Sd(x2)",
                   "Sd(u)")
    for (i in 1:n){
      d <- function1_q6(N)
      for (j in 1:6){
        df[i,j] <- d[j]
      }
    }
    return(df)
  }
  
  df <- function2_q6()

Question 7

#a)

  # With N=10:
  df_1 <- function2_q6(N=10) 
  summary(df_1[, 1]) # To summarize the intercept
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -6.342   1.323   1.827   1.999   2.518  14.046
  summary(df_1[,2]) # To summarize the first coefficient
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -13.009   1.042   2.018   2.017   2.988  18.703
  summary(df_1[,3]) # To summarize the second coefficient
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.913   9.334   9.891   9.997  10.558  23.032     278
  hist(df_1[, 1])

  hist(df_1[,2]) 

  hist(df_1[,3]) 

#b)

  # With N=1000:
  df_2 <- function2_q6(N=1000) 
  summary(df_2[, 1]) # To summarize the intercept
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.600   1.937   1.999   2.001   2.063   2.332
  summary(df_2[,2]) # To summarize the first coefficient
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.384   1.898   1.998   2.000   2.102   2.575
  summary(df_2[,3]) # To summarize the second coefficient
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   9.643   9.931   9.997   9.998  10.063  10.390
  hist(df_2[, 1])

  hist(df_2[,2]) 

  hist(df_2[,3]) 

  # With N=1000:
  df_3 <- function2_q6(N=10000) 
  summary(df_3[, 1]) # To summarize the intercept
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.886   1.979   2.000   2.000   2.020   2.117
  summary(df_3[,2]) # To summarize the first coefficient
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.818   1.967   2.000   2.000   2.034   2.193
  summary(df_3[,3]) # To summarize the second coefficient
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   9.886   9.979  10.000  10.000  10.020  10.121
  hist(df_3[, 1])

  hist(df_3[,2]) 

  hist(df_3[,3]) 

We can see that as we increase sample size, the estimates converge around the values we predetermined 2 and 10.

Can I add an option to the histogram to modify the label?

Question 8

  # a)
  
  rm(list = ls())
  N = 10
  
  # b)
  
  x1 <- runif(N, min = 0, max = 1)
  x2 <- rbinom(N, 1, 0.3)
  u <- rchisq(N, df = 1)
  y <- 1 + 2 * x1 + 10 * x2 + u
  
  # c) 
  
  regression_q8 <- lm(y ~ x1 + x2) 
  
 # d) 

    function1_q8 <- function(N = 10){
    x1 <- runif(N, min = 0, max = 1)
    x2 <- rbinom(N, 1, 0.3)
    u <- rchisq(N, df = 1)
    y <- 1 + 2*x1 + 10*x2 + u
    regression_q8 <- lm(y ~ x1 + x2) 
    summary_regq8 <- summary(regression_q8)
    return(c(regression_q8$coefficients, "sd(x1)"=sd(x1), "sd(x2)"=sd(x2), "sd(u)"=sd(u), summary_regq8$fstatistic))
  }
  
  # Second, a function to simulate this several times and collect the data.
  function2_q8 <- function(n=10000, N=10){
    df_q8 <- data.frame(matrix(0, ncol = 7, nrow = 0))
    names(df_q8) <- c("Intercept",
                   "First coef",
                   "Second coef",
                   "Sd(x1)",
                   "Sd(x2)",
                   "Sd(u)",
                   "F-stat")
    for (i in 1:n){
      d <- function1_q8(N)
      for (j in 1:7){
        df_q8[i,j] <- d[j]
      }
    }
    return(df_q8)
  }
  
  df_q8 <- function2_q8()
  
  summary(df_q8)
##    Intercept        First coef       Second coef         Sd(x1)       
##  Min.   :-5.038   Min.   :-13.616   Min.   : 1.686   Min.   :0.07118  
##  1st Qu.: 1.331   1st Qu.:  1.030   1st Qu.: 9.342   1st Qu.:0.25441  
##  Median : 1.826   Median :  2.014   Median : 9.887   Median :0.28672  
##  Mean   : 2.003   Mean   :  1.995   Mean   : 9.995   Mean   :0.28484  
##  3rd Qu.: 2.503   3rd Qu.:  2.954   3rd Qu.:10.553   3rd Qu.:0.31745  
##  Max.   :11.929   Max.   : 14.807   Max.   :18.825   Max.   :0.42010  
##                                     NA's   :290                       
##      Sd(x2)           Sd(u)             F-stat         
##  Min.   :0.0000   Min.   :0.09707   Min.   :    0.001  
##  1st Qu.:0.4216   1st Qu.:0.77403   1st Qu.:   34.624  
##  Median :0.4830   Median :1.11730   Median :   76.927  
##  Mean   :0.4474   Mean   :1.24991   Mean   :  176.528  
##  3rd Qu.:0.5164   3rd Qu.:1.58767   3rd Qu.:  174.690  
##  Max.   :0.5270   Max.   :6.16029   Max.   :17223.124  
## 
#8.1)

  # With N=10:
  df_q8_1 <- function2_q8(N=10) 
  summary(df_q8[, 1]) # To summarize the intercept
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -5.038   1.331   1.826   2.003   2.503  11.929
  summary(df_q8[,2]) # To summarize the first coefficient
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -13.616   1.030   2.014   1.995   2.954  14.807
  summary(df_q8[,3]) # To summarize the second coefficient
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.686   9.342   9.887   9.995  10.553  18.825     290
  summary(df_q8[, 7])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     0.001    34.624    76.927   176.528   174.690 17223.124
  hist(df_q8[, 1])

  hist(df_q8[,2]) 

  hist(df_q8[,3]) 

  hist(df_q8[, 7])

#8.2)

  # With N=1000:
  df_q8_2 <- function2_q8(N=1000) 
  summary(df_q8[, 1]) # To summarize the intercept
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -5.038   1.331   1.826   2.003   2.503  11.929
  summary(df_q8[,2]) # To summarize the first coefficient
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -13.616   1.030   2.014   1.995   2.954  14.807
  summary(df_q8[,3]) # To summarize the second coefficient
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.686   9.342   9.887   9.995  10.553  18.825     290
  summary(df_q8[, 7])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     0.001    34.624    76.927   176.528   174.690 17223.124
  hist(df_q8[, 1])

  hist(df_q8[,2]) 

  hist(df_q8[,3]) 

  hist(df_q8[, 7])

Question 9